#ifndef _UTILITY_H_
#define _UTILITY_H_

//#include <iostream>
#include <vector>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <string.h>
#include <ctype.h>

// CUDA runtime library
#include <cuda_runtime.h>
#include <curand_kernel.h>

using namespace std;

// NOTE(review): CPU_MODE / GPU_MODE look like selectors between a host and a CUDA
// code path; the code that reads them is not in this header — confirm at the use sites.
#define CPU_MODE 0
#define GPU_MODE 1

// NOTE(review): two mutually exclusive option values (0 / 1); what "exact guess" and
// "model selection" select between is not visible in this header.
#define EXACT_GUESS 0
#define MODEL_SELECTION 1

// NOTE(review): two screening-method option values; "KSA" is not expanded anywhere
// in this header — confirm what it abbreviates.
#define SCREENING_KSA 0
#define SCREENING_CHISQUARE 1

#define CHI_SQUARE_THRESHOLD_DF1 10.828f // chi-square distribution table value for 1 degree of freedom at p = 0.001

// NOTE(review): large float constant, presumably a "no prefiltering" sentinel/default —
// confirm against the code that compares with it.
#define DEFAULT_PREFILTERING_VALUE 99999.0f
// NOTE(review): small float constant (1e-6), presumably a tolerance/epsilon — confirm usage.
#define DEL 0.000001f

// General utility functions for BOOST
// Type definitions for BOOST: 64-bit signed/unsigned integer aliases.
typedef long long   int64;
typedef unsigned long long uint64;
// printf-style conversion specifiers that match the long long based typedefs above
// (signed decimal, unsigned decimal, hexadecimal).
#define FMT_INT64   "%lld"
#define FMT_UINT64   "%llu"
#define FMT_HEX64   "%llx"

// NOTE(review): presumably the X and Y extents (2 x 3) of the per-SNP table addressed
// through the pMarginalDistrSNP_Y pointers declared below — confirm against the indexing code.
#define MarginalDistrSNP_Y_DimensionX 2
#define MarginalDistrSNP_Y_DimensionY 3

// Precomputed bit-count lookup table used by popcount(): entry v is meant to hold the
// number of 1 bits in the 16-bit value v (0..65535).
// The table has static storage, so it is all zeros until some other code fills it;
// nothing in this header populates it. Because it is declared `static` in a header,
// every translation unit that includes this file gets its own separate copy.
static unsigned char wordbits[65536];// { bitcounts of ints between 0 and 65535 };

// Count the 1 bits of a 64-bit word by table lookup.
// The word is cut into four 16-bit slices; each slice indexes wordbits[] and the
// four table entries are summed. The result is only meaningful once wordbits[]
// has been filled with per-index bit counts (this function does not fill it).
static int popcount( uint64 i )
{
	int total = 0;
	// Slices at bit offsets 0, 16, 32 and 48; the mask keeps each index within 0..65535.
	for (int shift = 0; shift < 64; shift += 16)
	{
		total += wordbits[(i >> shift) & 0xFFFF];
	}
	return total;
}

// Bundle of run options passed by value to the permutation entry points declared in
// this header (permGPU, cuda_permute_marginal, cuda_permute_interaction).
// NOTE(review): the per-field notes below are inferred from the field names only —
// confirm each against the code that reads it.
struct KernelParams {
	int numThread;       // presumably threads per block for the kernel launch — TODO confirm
	int numBlock;        // presumably number of blocks for the kernel launch — TODO confirm
	int numPermutation;  // presumably how many permutations to run — TODO confirm
	int deviceId;        // presumably the CUDA device index to use (cf. initCUDA(int deviceId))
	int isMarginal;      // int used as a flag; presumably marginal vs. interaction test — TODO confirm
	int output2screen;   // int used as a flag; presumably enables printing results to stdout — TODO confirm
};

// Count the 1 bits in a 64-bit word (alternative implementation to popcount();
// the definition is not in this header).
int bitCount(uint64 i);

// Return the absolute value of a double.
double Abs(double a);

// Convert a string to upper case. strLen is passed alongside the buffer.
// NOTE(review): the non-const char* suggests the conversion is done in place, and
// strLen is presumably the number of characters to convert — confirm in the definition.
void toUpperCaseString(char* inputString, int strLen);

// Get the data size of an input list file of the BOOST program.
// NOTE(review): DataSize is an int**, so the function presumably allocates an array and
// hands it back through this pointer; ownership and the meaning of the int return value
// are not visible here — confirm in the definition.
int GetDataSize(char *filename, int **DataSize);

// Calculate the marginal entropy.
// Results are written through MarginalEntropySNP and MarginalEntropySNP_Y (non-const
// output pointers; the function returns void).
// NOTE(review): genocase / genoctrl look like bit-packed genotype data for cases and
// controls, with nlongintcase / nlongintctrl the number of 64-bit words per row, nsnp the
// SNP count and n the sample count — all inferred from names; confirm in the definition.
void CalculateMarginalEntropy(uint64* genocase, uint64* genoctrl, int nsnp, int n, int nlongintcase, int nlongintctrl, double *MarginalEntropySNP, double *MarginalEntropySNP_Y);

// Calculate the marginal distribution.
// Same input parameters as CalculateMarginalEntropy; results go to the int buffers
// pMarginalDistrSNP and pMarginalDistrSNP_Y.
// NOTE(review): required buffer sizes are not stated here — presumably related to
// MarginalDistrSNP_Y_DimensionX / _DimensionY; confirm in the definition.
void CalculateMarginalDistr(uint64* genocase, uint64* genoctrl, int nsnp, int n, int nlongintcase, int nlongintctrl, int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y);

// Calculate the joint distribution for one pair, written to GenoDistr.
// NOTE(review): j1 and j2 are presumably the indices of the two SNPs in the pair, and
// pMarginalDistrSNP_Y presumably the table produced by CalculateMarginalDistr — confirm.
void CalculateGenoJointDistr(uint64* genocase, uint64* genoctrl, int nsnp, int nLongIntcase, int nLongIntctrl, int *GenoDistr, int j1, int j2, int* pMarginalDistrSNP_Y);

// Calculate the chi-square value of a model.
// The parameter is declared as an array of 4 int pointers (it is passed as int**).
// NOTE(review): what each of the four pointers refers to is not visible here — confirm.
float CalculateChiSquareOfModel(int* input[4]);

// CUDA function headers
#include <list>

//#define THREAD_NUM 256
//#define BLOCK_NUM 10000

// Detect whether a usable CUDA-capable GPU is available.
// NOTE(review): the meaning of the int return value (flag vs. device count vs. error
// code) is not visible in this header — confirm in the definition.
int meetCUDARequirement();

// Initialize CUDA for the device with the given index.
// NOTE(review): return-value convention is not visible here — confirm in the definition.
int initCUDA(int deviceId);

// Holder for CUDA device information: an array of cudaDeviceProp records plus a count.
// NOTE(review): the class stores a raw pointer and declares a destructor but no copy
// constructor or copy assignment. If the destructor releases devPropArray, copying an
// instance would lead to a double free — confirm against the implementation and avoid
// passing this class by value until then.
class DeviceProperties {
public:
	DeviceProperties();
	~DeviceProperties();
	// NOTE(review): presumably returns devCount — confirm.
	int getDeviceCount();
	// Returns a cudaDeviceProp by value for index i.
	// NOTE(review): presumably devPropArray[i]; whether i is range-checked is not visible here.
	cudaDeviceProp getDeviceProp(int i);
	// Prints the properties of device i (output destination not visible in this header).
	void printDevProp(int i);
private:
	cudaDeviceProp* devPropArray; // NOTE(review): presumably one entry per detected device — confirm
	int devCount;                 // NOTE(review): presumably the number of entries in devPropArray — confirm
};

// Hand a bit-count table of `count` bytes to the CUDA side.
// NOTE(review): presumably copies the host wordbits[] table into device memory so that
// device code can do the same lookup as popcount() — confirm in the definition.
extern "C" void cuda_SetWordBits(const unsigned char* wordBits, int count);

// C++ calling function for permute. The KernelParams argument is taken by value
// (the parameter is unnamed in this declaration).
int permGPU(char *inputFilename, char *indexFile, char *outputFilePrefix, struct KernelParams);

// C-linkage entry point for the permutation run on packed genotype data.
// NOTE(review): nLongIntSample is presumably the number of 64-bit words per sample row,
// nSample / nCase / nSNP the sample, case and SNP counts — inferred from names; confirm.
extern "C" int cuda_permute(uint64 *hostGenoData, int nLongIntSample, int nSample, int nCase, int nSNP);

// C-linkage entry point for the marginal variant of the permutation run.
// Takes the same data description as cuda_permute plus position / statistics / p_value
// arrays, an output prefix string and the KernelParams options.
// NOTE(review): whether position / statistics / p_value are inputs, outputs or both is
// not visible here — confirm in the definition.
extern "C" int cuda_permute_marginal(uint64 *hostGenoData, int nLongIntSample, int nSample, int nCase, int nSNP, int *position, double *statistics, double *p_value, char *outputPrefix, struct KernelParams kernelParams);

// C-linkage entry point for the interaction variant of the permutation run; the
// parameter list is identical to cuda_permute_marginal.
extern "C" int cuda_permute_interaction(uint64 *hostGenoData, int nLongIntSample, int nSample, int nCase, int nSNP, int *position, double *statistics, double *p_value, char *outputPrefix, struct KernelParams kernelParams);

// Read the data size and load the data from inputFilename / indexFile.
// genoData, pos, sta and p_val are pointer-to-pointer parameters and nSample, nCase,
// nSNP are pointer parameters, so results are returned through them.
// NOTE(review): presumably the buffers are allocated inside and owned by the caller
// afterwards; isMarginal presumably matches KernelParams::isMarginal — confirm.
extern "C" int readDataSizeAndLoad(char *inputFilename, char *indexFile, uint64 **genoData, int *nSample, int *nCase, int *nSNP, int **pos, double **sta, double **p_val, int isMarginal);

// Device function (callable from GPU code only) that fills *margin from the packed labels.
// NOTE(review): struct MarginalDistr is only named here; its definition is not in the
// visible part of this header, so including files must provide it before use.
extern "C" __device__ void getMarginalDistr(uint64 *labels, int nLongIntSample, int nPair, struct MarginalDistr *margin);

// Device function that computes a joint distribution into jointDistr, taking the
// marginal distribution as an additional argument.
// NOTE(review): the required size of jointDistr and the meaning of nPair are not
// visible here — confirm in the definition.
extern "C" __device__ void getJointDistr(uint64 *labels, int nLongIntSample, int nSample, int nPair, struct MarginalDistr *margin, int *jointDistr);

// Device function that returns a double statistic computed from a joint distribution
// and the case / control counts.
// NOTE(review): which statistic is computed is not visible in this header — confirm.
extern "C" __device__ double calStatistics(int *jointDistr, int nCase, int nCtrl);

// Kernel (__global__) taking 32-bit direction vectors and an array of
// curandStateSobol32_t generator states.
// NOTE(review): presumably initializes the Sobol32 states from the direction vectors;
// the expected grid/block layout and the required length of both arrays are not
// documented here — confirm against the launch site.
extern "C" __global__ void devInitRand(unsigned int *devDirectionVectors32, curandStateSobol32_t *devSobol32State);

// used to flush stdout regularly
#define FLUSH_STDOUT() fflush(stdout)

#endif
